Rey et al. 2020 data analysis

Running this notebook

To do.

Importing data

The PacMAN pipeline exports results as a phyloseq object which can be imported into R for analysis. This is how phyloseq objects are structured:

Read the phyloseq object and verify that we have a OTU table, a sample table, and a taxonomy table. A phylogenetic tree has not been calculated so that slot is empty.

physeq <- readRDS("../results_noblast/phyloseq_object.rds")

physeq
## phyloseq-class experiment-level object
## otu_table()   OTU Table:         [ 5327 taxa and 22 samples ]
## sample_data() Sample Data:       [ 22 samples by 20 sample variables ]
## tax_table()   Taxonomy Table:    [ 5327 taxa by 14 taxonomic ranks ]

While we can access these tables individually, the phyloseq package also has a function psmelt() to convert the phyloseq object into a single large data frame:

library(dplyr)

df <- psmelt(physeq) %>%
  mutate_if(is.character, list(~na_if(., ""))) %>%
  as_tibble()

df
## # A tibble: 117,194 × 37
##    OTU    Sample Abund…¹ eventID mater…² event…³ verba…⁴ local…⁵ verba…⁶ event…⁷
##    <chr>  <chr>    <int> <chr>   <chr>   <chr>   <chr>   <chr>   <chr>   <chr>  
##  1 asv.2  SAMN1…   25147 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-1…
##  2 asv.7  SAMN1…   16275 SAMN10… SAMN10… settle… 43.28 … Spain:… 7m      2017-0…
##  3 asv.8  SAMN1…   15862 SAMN10… SAMN10… settle… 43.37 … Spain:… 7m      2017-0…
##  4 asv.6  SAMN1…   15401 SAMN10… SAMN10… filter… 43.34 … Spain:… surface 2017-0…
##  5 asv.5  SAMN1…   12944 SAMN10… SAMN10… settle… 43.37 … Spain:… 7m      2017-1…
##  6 asv.11 SAMN1…   12292 SAMN10… SAMN10… settle… 43.34 … Spain:… 7m      2017-0…
##  7 asv.14 SAMN1…   11142 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
##  8 asv.10 SAMN1…   10447 SAMN10… SAMN10… filter… 43.37 … Spain:… surface 2017-0…
##  9 asv.3  SAMN1…   10377 SAMN10… SAMN10… settle… 43.34 … Spain:… 7m      2017-1…
## 10 asv.18 SAMN1…    9850 SAMN10… SAMN10… settle… 43.37 … Spain:… 7m      2017-0…
## # … with 117,184 more rows, 27 more variables: occurrenceStatus <chr>,
## #   target_gene <chr>, subfragment <chr>, pcr_primer_forward <chr>,
## #   pcr_primer_reverse <chr>, pcr_primer_name_forw <chr>,
## #   pcr_primer_name_reverse <chr>, pcr_primer_reference <chr>,
## #   lib_layout <chr>, seq_meth <chr>, sop <chr>, votu_db <chr>, My_name <chr>,
## #   kingdom <chr>, phylum <chr>, class <chr>, order <chr>, family <chr>,
## #   genus <chr>, species <chr>, lastvalue <chr>, otu_seq_comp_appr <chr>, …

Exploratory data analysis

Abundance by sample type and phylum

library(tidyr)

stats <- df %>%
  group_by(eventRemarks, phylum) %>%
  summarize(abundance = sum(Abundance))

spread(stats, eventRemarks, abundance) %>%
  mutate(total = `settlement plates` + `filtered water`) %>%
  arrange(desc(total)) %>%
  knitr::kable()
phylum filtered water settlement plates total
NA 353725 126557 480282
Arthropoda 13857 99041 112898
Cnidaria 3603 60871 64474
Bacillariophyta 45861 979 46840
Annelida 8706 15085 23791
Bryozoa 454 22631 23085
Nemertea 7 20234 20241
Chlorophyta 17922 10 17932
Mollusca 3801 6219 10020
Haptista 4682 13 4695
Rhodophyta 3082 197 3279
Rotifera 1926 56 1982
Oomycota 1303 414 1717
Basidiomycota 1481 18 1499
Chordata 1213 211 1424
Platyhelminthes 11 1382 1393
Prasinodermophyta 1071 0 1071
Echinodermata 767 53 820
Ascomycota 733 3 736
Streptophyta 444 0 444
Porifera 281 115 396
Discosea 173 63 236
Nematoda 21 95 116
Placozoa 96 0 96
Phoronida 57 18 75
Evosea 62 7 69
Gastrotricha 6 32 38
Picozoa 28 0 28
Chytridiomycota 18 0 18
Sipuncula 12 0 12
Imbricatea 0 7 7
Chaetognatha 5 0 5
Entoprocta 0 4 4
Tubulinea 3 0 3
Onychophora 2 0 2
Zoopagomycota 2 0 2
library(ggplot2)

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = abundance < 5000 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(eventRemarks) %>%
  mutate(abundance = abundance / sum(abundance)) %>%
  group_by(eventRemarks, phylum) %>%
  summarize(abundance = sum(abundance))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = eventRemarks, fill = phylum, x = abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") + theme_minimal()

Abundance (reads) by sample and phylum

stats <- df %>%
  rename(sample = Sample) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(abundance = sum(Abundance))

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = abundance < 2000 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(sample) %>%
  mutate(relative_abundance = abundance / sum(abundance)) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(abundance = sum(abundance), relative_abundance = sum(relative_abundance))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = relative_abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = abundance), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

Diversity (ASVs) by sample and phylum

stats <- df %>%
  filter(Abundance > 0) %>%
  rename(sample = Sample) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(asvs = n())

stats_simplified <- stats %>%
  mutate(
    phylum = case_when(phylum = asvs < 16 ~ "Other", TRUE ~ phylum)
  ) %>%
  group_by(sample, eventRemarks, phylum) %>%
  summarize(asvs = sum(asvs))
  
ggplot() +
  geom_bar(data = stats_simplified, aes(y = sample, fill = phylum, x = asvs), stat = "identity") +
  scale_fill_brewer(palette = "Spectral", na.value = "#eeeeee") +
  theme_minimal() +
  theme(panel.grid.major.y = element_blank(), panel.grid.minor.y = element_blank())

Most abundant species

Let’s list the species with the highest relative abundance across all samples:

top_species <- df %>%
  filter(!is.na(species)) %>%
  group_by(species) %>%
  summarize(abundance = sum(Abundance)) %>%
  arrange(desc(abundance)) %>%
  head(20)

top_species %>% knitr::kable()
species abundance
Balanus trigonus 37896
Bougainvillia muscus 33490
Minutocellus polymorphus 27149
Obelia dichotoma 19006
Emplectonema gracile 16805
Micromonas pusilla 15598
Dictyocha speculum 13538
Bugula neritina 10243
Harpyia umbrosa 9099
Cutleria multifida 5503
Ostrea stentina 5267
Polydora neocaeca 5066
Paracalanus parvus 4374
Sabellaria spinulosa 3627
Amphibalanus eburneus 3567
Palmaria decipiens 2398
Amphinema dinema 2084
Chloroparvula pacifica 1528
Pseudochattonella farcimen 1509
Campanularia hincksii 1478

For the most abundant species, plot the abundance by sample:

stats <- df %>%
  filter(species %in% top_species$species) %>%
  group_by(species, eventRemarks, Sample) %>%
  summarize(abundance = sum(Abundance))

ggplot() +
  geom_jitter(data = stats, aes(y = species, x = abundance, color = eventRemarks), pch = 21, height = 0.1, width = 0) +
  scale_color_brewer(palette = "Set1") +
  theme_minimal()

Invasive species

In this section we will check our results against the World Register of Introduced Marine Species (WRiMS). The repository contains a CSV file containing the LSIDs for all species in WRiMS.

wrims_lsids <- read.csv("wrims_lsids.csv")

invasives <- df %>%
  filter(lsid %in% wrims_lsids$lsid)

invasives %>%
  group_by(phylum, species) %>%
  summarize(abundance = sum(Abundance)) %>%
  arrange(desc(abundance)) %>%
  rmarkdown::paged_table()

ASV accumulation curves

To do.

Alpha and beta diversity

To do.

Multidimensional scaling

ord <- ordinate(physeq = physeq, method = "PCoA", distance = "bray")

plot_ordination(physeq = physeq, ordination = ord, type = "samples", color = "eventRemarks", shape = "locality") +
  geom_point(aes(color = eventRemarks), alpha = 1, size = 4) +
  geom_text(aes(label = materialSampleID), alpha = 0.7, size = 3, vjust = 2) +
  stat_ellipse(aes(group = eventRemarks)) +
  scale_color_brewer(palette = "Set1") +
  theme_classic()

# todo: Canonical analysis of principal coordinates